GitHub in OUP “Bioinformatics”
# Build a PubMed query string that requires one repository name in the
# title/abstract, excludes each of the other repository names, and restricts
# the search to the journal Bioinformatics.
ti_ab <- '[Title/Abstract]'
build_query <- function(incl_repo, excl_repos) {
  # Fold over the excluded names, appending one " NOT <name>[Title/Abstract]"
  # clause per name in the order given. With no excluded names the starting
  # term is returned unchanged.
  with_exclusions <- Reduce(
    function(query_so_far, excluded) {
      paste0(query_so_far, " NOT ", excluded, ti_ab)
    },
    excl_repos,
    paste0(incl_repo, ti_ab)
  )
  # paste() with its default separator inserts the single space before "AND".
  paste(with_exclusions, 'AND "Bioinformatics (Oxford, England)"[Journal]')
}
# Get count of results for query and year
#
# Arguments:
#   query  PubMed query string.
#   year   Calendar year; the search is limited to year/01/01 - year/12/31.
# Returns the value of QueryCount() for the search. If anything in the lookup
# raises an error, 0 is still returned (as before), but a warning carrying the
# underlying error message is now emitted so that a failed lookup is not
# silently indistinguishable from a genuine count of zero.
get_res_count <- function(query, year) {
  tryCatch({
    res <- EUtilsSummary(query = query,
                         db = "pubmed",
                         mindate = paste0(year, "/01/01"),
                         maxdate = paste0(year, "/12/31"),
                         retmax = 5000,
                         datetype = "ppdt")
    QueryCount(res)
  }, error = function(e) {
    warning("PubMed count failed for year ", year, " (query: ", query,
            "); returning 0: ", conditionMessage(e), call. = FALSE)
    0
  })
}
# Specific queries

# Result count for `year` of the query that includes `incl_repo` and excludes
# every name in `excl_repos`.
num_abs_year <- function(incl_repo, excl_repos, year) {
  repo_query <- build_query(incl_repo, excl_repos)
  get_res_count(repo_query, year)
}

# Result count for `year` of the journal-only query (no repository terms).
total_bioinf_year <- function(year) {
  journal_only <- '"Bioinformatics (Oxford, England)"[Journal]'
  get_res_count(journal_only, year)
}
# Build a long-format table with one row per (year, repo) combination.
#
# Arguments:
#   years  Vector of years to query.
#   repos  Character vector of repository names; each name is counted with
#          all the other names excluded from the query.
# Returns a data frame with columns repo, year, num_abstracts and
# total_articles. A zero-row data frame with those columns is returned when
# there is nothing to query.
make_count_table <- function(years, repos) {
  # Zero-row template: defines the result when `years` or `repos` is empty.
  template <- data.frame(repo = character(), year = integer(),
                         num_abstracts = integer(), total_articles = integer())
  # Collect the rows in a preallocated list and bind once at the end instead
  # of re-copying the growing data frame on every iteration.
  rows <- vector("list", length(years) * length(repos))
  next_row <- 1L
  for (year in years) {
    # The journal total does not depend on the repo, so query it once per year.
    total_articles <- total_bioinf_year(year)
    for (repo in repos) {
      num_abstracts <- num_abs_year(repo, setdiff(repos, repo), year)
      rows[[next_row]] <- data.frame(repo = repo,
                                     year = year,
                                     num_abstracts = num_abstracts,
                                     total_articles = total_articles)
      next_row <- next_row + 1L
    }
  }
  do.call(rbind, c(list(template), rows))
}
# Query PubMed once per (year, repository name) pair.
count_table <- make_count_table(2009:2017, c("GitHub", "Bitbucket", "SourceForge"))
# Total number of articles per year, one row per year.
articles_per_year <- count_table %>%
  select(year, total_articles) %>%
  unique()
# Per-year remainder: the yearly total minus the sum of the per-repository
# counts. NOTE(review): each per-repository query excludes the other names,
# so abstracts naming more than one repository presumably land in this
# remainder too -- confirm that is intended for the label below.
abstracts_without_repo <- count_table %>%
  group_by(year) %>%
  mutate(other = total_articles - sum(num_abstracts)) %>%
  select(year, other) %>%
  unique() %>%
  mutate(repo = "Abstracts with none of these") %>%
  rename(num_abstracts = other) %>%
  select(repo, year, num_abstracts) %>%
  left_join(articles_per_year, by = "year")
count_table <- rbind(count_table, data.frame(abstracts_without_repo))
# Fix the level order of `repo` explicitly.
count_table$repo <- factor(
  count_table$repo,
  levels = c("Abstracts with none of these", "Bitbucket", "SourceForge", "GitHub")
)
# Stacked bar chart: abstracts per publication year, filled by repository.
ggplot() +
  geom_bar(
    # Year as character gives a discrete x axis with one bar per year.
    data = count_table %>%
      mutate(year = as.character(year)),
    mapping = aes(x = year, y = num_abstracts, fill = repo),
    stat = "identity"
  ) +
  xlab("Year of publication in Bioinformatics (Oxford University Press)") +
  ylab("Number of abstracts containing repository name") +
  guides(fill = guide_legend(title = "Repository")) +
  theme_bw() +
  theme(
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  # One fill color per level of `repo`.
  scale_fill_manual(values = c("gray", "#4DAF4A", "#377EB8", "#E41A1C"))

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("abs_bioinf_repo.png")
## Saving 10 x 8 in image
ggsave("abs_bioinf_repo.pdf")
## Saving 10 x 8 in image
Topic modeling
# Repository metrics plus every column whose name contains "topic".
# Metrics that are plotted under a display name are renamed while selecting;
# column order matches the order listed here.
plt_data_topics <- as.tbl(repo_data_main) %>%
  select(
    Commits = commits,
    `Commit authors` = commit_authors,
    Forks = forks_count,
    Subscribers = subscribers_count,
    Stargazers = stargazers_count,
    `Mean PMC citations / week` = num_citations_per_week_pmc_minus_2_years,
    total_file_size_no_data,
    `Total files` = num_files_no_data,
    contains("topic")
  )
# Change NA's
#
# Replace the missing entries of `x` with `c`.
# Works on scalars (as before) and also on whole vectors, including empty
# ones; the previous scalar-only `if (is.na(x))` could not handle those.
# Input that contains no NA is returned untouched, so its type is preserved.
change_na <- function(x, c) {
  if (anyNA(x)) {
    x[is.na(x)] <- c
  }
  x
}
# Replace NA by 0.
na_to_zero <- function(x) change_na(x, 0)
# Replace NA by 1.
na_to_one <- function(x) change_na(x, 1)
# Fill missing values element by element with na_to_zero() / na_to_one().
for (zero_fill_col in c("Commits", "Commit authors", "Mean PMC citations / week")) {
  plt_data_topics[[zero_fill_col]] <-
    sapply(plt_data_topics[[zero_fill_col]], na_to_zero)
}
plt_data_topics[["total_file_size_no_data"]] <-
  sapply(plt_data_topics[["total_file_size_no_data"]], na_to_one)

# Derive the plotted metrics: file size divided by 1e6, and a +1 shift for
# forks, subscribers, stargazers and citations. Drop the unshifted originals.
plt_data_topics <- plt_data_topics %>%
  dplyr::mutate(
    `Megabytes of code` = total_file_size_no_data / 1000000,
    `Forks + 1` = Forks + 1,
    `Subscribers + 1` = Subscribers + 1,
    `Stargazers + 1` = Stargazers + 1,
    `1 + mean PMC citations / week` = `Mean PMC citations / week` + 1
  ) %>%
  select(-total_file_size_no_data, -Forks, -Subscribers, -Stargazers,
         -`Mean PMC citations / week`)

# Reshape in two steps:
#  1. melt the topic columns and keep only rows whose topic flag is TRUE, so
#     a repo appears once per topic it has;
#  2. melt the metric columns so each row is (Topic, metric name, value).
plt_data_topics <- plt_data_topics %>%
  melt(id.vars = c("Commits", "Commit authors", "Forks + 1",
                   "Subscribers + 1", "Stargazers + 1",
                   "1 + mean PMC citations / week",
                   "Megabytes of code", "Total files")) %>%
  filter(value) %>%
  as.tbl() %>%
  select(-value) %>%
  dplyr::rename(Topic = variable) %>%
  melt(id.vars = "Topic")
# Add number of repos per topic
topic_cols <- as.character(unique(plt_data_topics$Topic))
# Number of repos flagged for each topic column in the full data set.
topic_ct <- unlist(map(topic_cols, function(x) sum(repo_data_main[[x]])))
names(topic_ct) <- topic_cols
# Look the counts up by topic *name*. When Topic is a factor (which melt()
# produces by default), indexing `topic_ct` with a factor element would use
# the integer level code instead of the name; that only gives the right
# count while level order happens to match `topic_cols`.
topic_names <- as.character(plt_data_topics$Topic)
topic_with_ct <- paste0(topic_names, " (N=", topic_ct[topic_names], ")")
plt_data_topics$Topic <- topic_with_ct
# Turn column names into display labels.
plt_data_topics <-
  plt_data_topics %>%
  mutate(Topic = gsub("topic_", "", Topic)) %>%
  mutate(Topic = gsub("_", " ", Topic)) %>%
  mutate(Topic = gsub("RNA.seq", "RNA-seq", Topic))
# Make the plot
# One facet per metric; within each facet, one notched box per topic label.
ggplot(plt_data_topics,
       aes(x = variable, y = value, fill = factor(Topic))) +
  geom_boxplot(notch = TRUE) +
  theme_bw() +
  guides(fill = guide_legend(title = "Abstract includes topic")) +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13),
    # Legend placed inside the plotting area (relative coordinates).
    legend.position = c(0.835, 0.17)
  ) +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(0.1, 1, 10, 100, 1000, 10000), labels = comma)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 146 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("topics.png")
## Saving 12 x 8 in image
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 146 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
ggsave("topics.pdf")
## Saving 12 x 8 in image
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 146 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
Commits after publication
# Metrics to compare between repos with and without commits after the
# article appeared in PubMed. Columns are renamed to display names while
# selecting, rows with an unknown flag are dropped, and the metrics are
# melted to long format keyed by the flag.
plt_data_commits_after_pub <- as.tbl(repo_data_main) %>%
  select(
    `Total commits` = commits,
    `Commit authors` = commit_authors,
    `Total forks` = forks_count,
    `Total subscribers` = subscribers_count,
    `Total stargazers` = stargazers_count,
    `PMC citations / week` = num_citations_per_week_pmc_minus_2_years,
    `Commit message length` = mean_commit_message_len,
    `Pct outside commits` = pct_commits_diff_author_committer,
    `Outside commit authors` = num_non_committing_authors,
    `Commits after\npublication` = commits_after_article_in_pubmed
  ) %>%
  filter(!is.na(`Commits after\npublication`)) %>%
  melt(id.vars = "Commits after\npublication")

# Get smallest positive value of each variable so we can take logs
min_pos <- plt_data_commits_after_pub %>%
  filter(value > 0) %>%
  group_by(variable) %>%
  dplyr::summarize(min_pos = min(value))

# Remove top outliers for plot
p_outlier <- 1 # 1 means no filtering for outliers
outlier_cutoff <- plt_data_commits_after_pub %>%
  group_by(variable) %>%
  dplyr::summarize(
    outlier_cutoff = quantile(value, probs = p_outlier, na.rm = TRUE)
  )

# Attach both per-variable summaries to every row.
plt_data_commits_after_pub <- plt_data_commits_after_pub %>%
  left_join(min_pos, by = "variable") %>%
  left_join(outlier_cutoff, by = "variable")
# Replace 0's and NA's by minimum positive value
# Computed directly on the numeric columns. Row-wise apply() on a data frame
# first converts every row to character (numbers go through format()), which
# can round the values; a rounded-up maximum could then exceed
# `outlier_cutoff` and be dropped by the later filter.
plt_data_commits_after_pub$value_pos <- with(
  plt_data_commits_after_pub,
  ifelse(is.na(value), min_pos, pmax(value, min_pos))
)
# Replace NA's by minimum positive value
plt_data_commits_after_pub$value_non_na <- with(
  plt_data_commits_after_pub,
  ifelse(is.na(value), min_pos, value)
)
# Change true/false to yes/no
# Legend labels carry the number of repos in each group, counted over all of
# repo_data_main; rows where the flag is NA are counted in neither group.
label_yes <- paste0(
  "Yes (N=",
  sum(repo_data_main$commits_after_article_in_pubmed, na.rm = TRUE),
  ")"
)
label_no <- paste0(
  "No (N=",
  sum(!repo_data_main$commits_after_article_in_pubmed, na.rm = TRUE),
  ")"
)
# The flag column has no NA at this point (filtered when the table was built).
plt_data_commits_after_pub[["Commits after\npublication"]] <-
  ifelse(plt_data_commits_after_pub[["Commits after\npublication"]],
         label_yes,
         label_no)
# Apply the outlier cutoff and keep only the columns used below.
plt_data_commits_after_pub <- plt_data_commits_after_pub %>%
  filter(value_non_na <= outlier_cutoff) %>%
  select(`Commits after\npublication`, variable, value_non_na, value_pos)
# T test for each variable
# Unpaired two-sided t-test comparing the two "commits after publication"
# groups, one test per variable. `paired` is deliberately not passed:
# unpaired is the default, and the formula interface of t.test() rejects the
# `paired` argument in R >= 4.4.0.
# `x` and `y` give the position of the p-value label in each facet.
t_test_commits_after_pub <-
  plt_data_commits_after_pub %>%
  group_by(variable) %>%
  dplyr::summarize(
    p = t.test(value_non_na ~ `Commits after\npublication`,
               alternative = "two.sided")$p.value,
    x = 0.5,
    y = max(value_non_na)
  )
# Boxplots of each metric (log scale), split by whether the repo has commits
# after publication, with the per-variable t-test p-value as a label.
ggplot(plt_data_commits_after_pub, aes(x = variable, y = value_pos)) +
  geom_boxplot(aes(fill = `Commits after\npublication`)) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(labels = comma) +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  geom_label(
    data = t_test_commits_after_pub,
    mapping = aes(x = x, y = y,
                  label = paste("p =", formatC(p, format = "e", digits = 1))),
    fill = "white",
    size = 5.5,
    label.size = 0,
    color = "steelblue",
    hjust = 0
  ) +
  scale_fill_brewer(palette = "Set2")

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("commits_after_pub.png")
## Saving 11 x 8 in image
ggsave("commits_after_pub.pdf")
## Saving 11 x 8 in image
# Check if relationship between commit message length and commits after publication can be explained by team size
# Pearson correlation between number of commit authors and mean commit
# message length: estimate first, then p-value.
with(repo_data_main,
     cor.test(commit_authors, mean_commit_message_len, method = "pearson"))$estimate
## cor
## 0.1442577
with(repo_data_main,
     cor.test(commit_authors, mean_commit_message_len, method = "pearson"))$p.value
## [1] 1.896942e-09
# Make another version of the figure that is wide without outside commit authors
plt <- ggplot(
  plt_data_commits_after_pub %>% filter(variable != "Outside commit authors"),
  aes(x = variable, y = value_pos)
) +
  geom_boxplot(aes(fill = `Commits after\npublication`)) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  scale_y_log10() +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(size = 10),
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  ) +
  geom_label(
    # Same exclusion as for the boxes, so labels and facets stay in step.
    data = t_test_commits_after_pub %>% filter(variable != "Outside commit authors"),
    mapping = aes(x = x, y = y,
                  label = paste("p =", formatC(p, format = "e", digits = 1))),
    fill = "white",
    label.size = 0,
    color = "steelblue",
    hjust = 0
  ) +
  scale_fill_brewer(palette = "Set2")
plt

Outside contributors
# Community metrics versus number of outside commit authors, over all repos.
# Every count is shifted by +1 (the plot below uses log axes), renamed to a
# display name that states the shift, and the three community metrics are
# melted to long format.
plt_data_outside_contrib <- as.tbl(repo_data_all) %>%
  select(forks_count, subscribers_count, stargazers_count,
         num_non_committing_authors, is_high_profile) %>%
  mutate(
    forks_count = forks_count + 1,
    subscribers_count = subscribers_count + 1,
    stargazers_count = stargazers_count + 1,
    num_non_committing_authors = num_non_committing_authors + 1
  ) %>%
  dplyr::rename(
    `Total forks + 1` = forks_count,
    `Total subscribers + 1` = subscribers_count,
    `Total stargazers + 1` = stargazers_count,
    `Outside commit authors + 1` = num_non_committing_authors
  ) %>%
  melt(id.vars = c("Outside commit authors + 1", "is_high_profile"))

# Correlation
# Pearson correlation of each metric with the outside-author count.
# `x` and `y` position the label; y is 93% of the way up the log10 range.
corr_outside_contrib <-
  plt_data_outside_contrib %>%
  group_by(variable) %>%
  dplyr::summarize(
    corr_pearson = cor.test(value, `Outside commit authors + 1`,
                            method = "pearson")$estimate,
    pval_pearson = cor.test(value, `Outside commit authors + 1`,
                            method = "pearson")$p.value,
    x = 1,
    y = 10^(0.93 * log10(max(value)))
  )

# Collapse identical records
# Identical points become one row with a count, used as point size.
plt_data_outside_contrib <-
  plt_data_outside_contrib %>%
  group_by(`Outside commit authors + 1`, is_high_profile, variable, value) %>%
  dplyr::summarize(`Num repos` = n())
# Make the plot
# Log-log scatter, one facet per community metric.
ggplot(plt_data_outside_contrib) +
  geom_point(aes(x = `Outside commit authors + 1`,
                 y = value,
                 size = `Num repos`,
                 col = is_high_profile)) +
  # Second layer re-draws only the high-profile rows, after (on top of) the
  # layer containing all rows.
  geom_point(data = subset(plt_data_outside_contrib, is_high_profile),
             aes(x = `Outside commit authors + 1`,
                 y = value,
                 size = `Num repos`,
                 col = is_high_profile)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(
    axis.title.y = element_blank(),
    axis.text = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000), labels = comma) +
  scale_x_log10() +
  geom_label(
    data = corr_outside_contrib,
    mapping = aes(x = x, y = y, label = paste("r = ", signif(corr_pearson, digits = 2), "\np = ", formatC(pval_pearson, format = "e", digits = 1), sep = "")),
    fill = "white",
    label.size = 0,
    color = "mediumpurple4",
    hjust = 0
  )

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("outside_contrib.png")
## Saving 10 x 3 in image
ggsave("outside_contrib.pdf")
## Saving 10 x 3 in image
# How many repos have outside contributors?
# Share of repos with at least one outside commit author. Rows where the
# count is NA are not counted as having any, but stay in the denominator.
sum(repo_data_main$num_non_committing_authors > 0, na.rm = TRUE) / nrow(repo_data_main)
## [1] 0.1412791
sum(repo_data_high_prof$num_non_committing_authors > 0, na.rm = TRUE) / nrow(repo_data_high_prof)
## [1] 0.6956522
Commit authors and community
# Community metrics versus number of commit authors, over all repos.
# Forks, subscribers and stargazers are shifted by +1 (log axes below) and
# renamed to display names; commit authors are used unshifted.
plt_data_devs_community <- as.tbl(repo_data_all) %>%
  select(forks_count, subscribers_count, stargazers_count,
         commit_authors, is_high_profile) %>%
  mutate(
    forks_count = forks_count + 1,
    subscribers_count = subscribers_count + 1,
    stargazers_count = stargazers_count + 1
  ) %>%
  dplyr::rename(
    `Total forks + 1` = forks_count,
    `Total subscribers + 1` = subscribers_count,
    `Total stargazers + 1` = stargazers_count,
    `Commit authors` = commit_authors,
    `High profile` = is_high_profile
  ) %>%
  melt(id.vars = c("Commit authors", "High profile"))

# Correlation
# Pearson correlation of each metric with the number of commit authors.
# `x` and `y` position the label; y is 93% of the way up the log10 range.
corr_devs_community <-
  plt_data_devs_community %>%
  group_by(variable) %>%
  dplyr::summarize(
    corr_pearson = cor.test(value, `Commit authors`,
                            method = "pearson")$estimate,
    pval_pearson = cor.test(value, `Commit authors`,
                            method = "pearson")$p.value,
    x = 1,
    y = 10^(0.93 * log10(max(value)))
  )

# Collapse identical records
# Identical points become one row with a count, used as point size.
plt_data_devs_community <-
  plt_data_devs_community %>%
  group_by(variable, `Commit authors`, `High profile`, value) %>%
  dplyr::summarize(`Num repos` = n())
# Log-log scatter of each community metric against number of commit authors.
ggplot(plt_data_devs_community) +
  geom_point(aes(x = `Commit authors`,
                 y = value,
                 size = `Num repos`,
                 col = `High profile`)) +
  # Second layer re-draws only the high-profile rows, after (on top of) the
  # layer containing all rows.
  geom_point(data = subset(plt_data_devs_community, `High profile`),
             aes(x = `Commit authors`,
                 y = value,
                 size = `Num repos`,
                 col = `High profile`)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(
    axis.title.x = element_text(size = 16),
    axis.title.y = element_blank(),
    axis.text = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(1, 10, 100, 1000), labels = comma) +
  scale_x_log10() +
  geom_label(
    data = corr_devs_community,
    mapping = aes(x = x, y = y, label = paste("r = ", signif(corr_pearson, digits = 2), "\np = ", formatC(pval_pearson, format = "e", digits = 1), sep = "")),
    fill = "white",
    label.size = 0,
    color = "mediumpurple4",
    hjust = 0
  )
## Warning: Removed 4 rows containing missing values (geom_point).

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("commit_authors.png")
## Saving 11 x 4 in image
## Warning: Removed 4 rows containing missing values (geom_point).
ggsave("commit_authors.pdf")
## Saving 11 x 4 in image
## Warning: Removed 4 rows containing missing values (geom_point).
Commits
# Commit-history metrics versus total commits, over all repos.
# Only the "months without commits" metric is shifted by +1; all columns get
# display names, and everything except total commits and the high-profile
# flag is melted to long format.
plt_data_commits <- as.tbl(repo_data_all) %>%
  select(commits, mean_commits_per_month, consecutive_months_with_commits,
         commit_span_days, mean_files_added_per_month,
         num_days_new_files_added, consecutive_months_no_commits,
         is_high_profile) %>%
  mutate(consecutive_months_no_commits = consecutive_months_no_commits + 1) %>%
  dplyr::rename(
    `Total commits` = commits,
    `Mean commits/month` = mean_commits_per_month,
    `Max cons. months with commits` = consecutive_months_with_commits,
    `Project duration (days)` = commit_span_days,
    `Mean new files per month` = mean_files_added_per_month,
    `Days with new files added` = num_days_new_files_added,
    `1 + max cons. months no commits` = consecutive_months_no_commits,
    `High profile` = is_high_profile
  ) %>%
  melt(id.vars = c("Total commits", "High profile"))

# Correlation
# Pearson correlation of each metric with total commits.
# `x` and `y` position the label; y is 93% of the way up the log10 range of
# the non-missing values.
corr_commits <-
  plt_data_commits %>%
  group_by(variable) %>%
  dplyr::summarize(
    corr_pearson = cor.test(value, `Total commits`,
                            method = "pearson")$estimate,
    pval_pearson = cor.test(value, `Total commits`,
                            method = "pearson")$p.value,
    x = 1,
    y = 10^(0.93 * log10(max(value, na.rm = TRUE)))
  )

# Collapse identical records
# Identical points become one row with a count, used as point size.
plt_data_commits <-
  plt_data_commits %>%
  group_by(`Total commits`, variable, value, `High profile`) %>%
  dplyr::summarize(`Num repos` = n())
# Log-log scatter of each commit-history metric against total commits.
ggplot(plt_data_commits) +
  geom_point(aes(x = `Total commits`,
                 y = value,
                 size = `Num repos`,
                 col = `High profile`)) +
  # Second layer re-draws only the high-profile rows, after (on top of) the
  # layer containing all rows.
  geom_point(data = subset(plt_data_commits, `High profile`),
             aes(x = `Total commits`,
                 y = value,
                 size = `Num repos`,
                 col = `High profile`)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(
    axis.title.x = element_text(size = 16),
    axis.title.y = element_blank(),
    axis.text = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(1, 10, 100, 1000), labels = comma) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000), labels = comma) +
  geom_label(
    data = corr_commits,
    mapping = aes(x = x, y = y, label = paste("r = ", signif(corr_pearson, digits = 2), "\np = ", formatC(pval_pearson, format = "e", digits = 1), sep = "")),
    fill = "white",
    label.size = 0,
    color = "mediumpurple4",
    hjust = 0
  )
## Warning: Removed 20 rows containing missing values (geom_point).

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("commits.png")
## Saving 13 x 6 in image
## Warning: Removed 20 rows containing missing values (geom_point).
ggsave("commits.pdf")
## Saving 13 x 6 in image
## Warning: Removed 20 rows containing missing values (geom_point).
Languages: file sizes and lines of code
# Column-name form of each top language.
top_langs_as_header <- sapply(top_langs, format_lang_as_header)
# Names of the per-language file-count and mean-lines-of-code columns.
lang_cols <- unname(c(paste0("num_files_", top_langs_as_header),
                      paste0("mean_lines_code_", top_langs_as_header)))
# Zero-row starting table; per-language rows are appended below.
plt_data_langs <- data.frame(`Number of files` = integer(),
                             `Mean lines of code per file` = numeric(),
                             is_high_profile = logical(),
                             lang = character())
for (lang in top_langs_as_header) {
  col_nf <- paste0("num_files_", lang)
  col_loc <- paste0("mean_lines_code_", lang)
  # Rows for this language: its two columns under common names, keeping only
  # repos where both values are positive (both go on log axes).
  lang_rows <- repo_data_all %>%
    select(!!as.name(col_nf), !!as.name(col_loc), is_high_profile) %>%
    mutate(lang = lang) %>%
    rename(`Number of files` = !!as.name(col_nf),
           `Mean lines of code per file` = !!as.name(col_loc)) %>%
    filter(`Number of files` > 0 & `Mean lines of code per file` > 0)
  plt_data_langs <- rbind(plt_data_langs, lang_rows)
}
# Log-log scatter of mean lines of code per file against number of files,
# one facet per language, with axes shared across facets.
ggplot(plt_data_langs) +
  geom_point(aes(x = `Number of files`,
                 y = `Mean lines of code per file`,
                 col = is_high_profile)) +
  # Second layer re-draws only the high-profile rows, after (on top of) the
  # layer containing all rows.
  geom_point(data = subset(plt_data_langs, is_high_profile),
             aes(x = `Number of files`,
                 y = `Mean lines of code per file`,
                 col = is_high_profile)) +
  scale_color_manual(values = c(color_main, color_high_prof),
                     labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  facet_wrap(~lang, scales = "fixed", ncol = 5) +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000),
                labels = c("1", "10", "100", "1K", "10K")) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000),
                labels = c("1", "10", "100", "1K", "10K"))

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("languages.png")
## Saving 13 x 8 in image
ggsave("languages.pdf")
## Saving 13 x 8 in image
Bytes of code by language
# Names of the per-language byte-count columns.
lang_cols <- unname(paste0("bytes_", top_langs_as_header))
# Zero-row starting table; per-language rows are appended below.
plt_data_lang_bytes <- data.frame(bytes = integer(),
                                  is_high_profile = logical(),
                                  lang = character())
for (lang in top_langs_as_header) {
  col_b <- paste0("bytes_", lang)
  # Rows for this language: its byte column under the common name `bytes`.
  lang_byte_rows <- repo_data_all %>%
    select(!!as.name(col_b), is_high_profile) %>%
    mutate(lang = lang) %>%
    rename(bytes = !!as.name(col_b))
  plt_data_lang_bytes <- rbind(plt_data_lang_bytes, lang_byte_rows)
}
# Readable dataset label derived from the logical flag.
plt_data_lang_bytes$Dataset <- ifelse(plt_data_lang_bytes$is_high_profile,
                                      "High profile repos",
                                      "Main repos")
# Total code size per language (bytes / 1e6), one facet per dataset, each
# with its own axes.
ggplot(plt_data_lang_bytes,
       aes(x = lang, y = bytes / 1000000, fill = Dataset)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c(color_high_prof, color_main)) +
  theme_bw() +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 14),
    # The facet strips already name the dataset, so the legend is hidden.
    legend.position = "none",
    plot.margin = margin(10, 10, 10, 70)
  ) +
  xlab("Language") +
  ylab("Total megabytes of code") +
  facet_wrap(~Dataset, scales = "free")

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("bytes_by_lang.pdf")
## Saving 12 x 7 in image
ggsave("bytes_by_lang.png")
## Saving 12 x 7 in image
Years by topic
# Number of repos per (year of first commit, year in PubMed, topic).
# Topic columns are melted and only TRUE flags are kept, so a repo is counted
# once for every topic it has; rows with an unknown year are dropped.
plt_data_years_by_topic <- repo_data_main %>%
  select(first_commit, date_pubmed, contains("topic")) %>%
  mutate(year_first_commit = year(first_commit),
         year_pubmed = year(date_pubmed)) %>%
  select(-first_commit, -date_pubmed) %>%
  melt(id.vars = c("year_first_commit", "year_pubmed")) %>%
  filter(value) %>%
  group_by(year_first_commit, year_pubmed, variable) %>%
  dplyr::summarize(`Num repos` = n()) %>%
  filter(!is.na(year_first_commit) & !is.na(year_pubmed)) %>%
  rename(Topic = variable)
# Turn topic column names into display labels.
plt_data_years_by_topic$Topic <- gsub(
  "_", " ",
  gsub("topic_", "",
       gsub("RNA.seq", "RNA-seq", as.character(plt_data_years_by_topic$Topic)))
)
# Common year range over both year columns, used for both axes of the plot.
min_year <- min(plt_data_years_by_topic$year_first_commit,
                plt_data_years_by_topic$year_pubmed)
max_year <- max(plt_data_years_by_topic$year_first_commit,
                plt_data_years_by_topic$year_pubmed)
# Year of first commit against year in PubMed, one facet per topic; point
# size is the number of repos. Both axes use the same year range.
ggplot(plt_data_years_by_topic) +
  geom_point(aes(x = year_first_commit,
                 y = year_pubmed,
                 size = `Num repos`)) +
  theme_bw() +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 14),
    strip.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 14)
  ) +
  facet_wrap(~Topic, scales = "fixed", ncol = 2) +
  xlim(min_year, max_year) +
  ylim(min_year, max_year) +
  xlab("Year of initial commit") +
  ylab("Year paper in PubMed")

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("topics_by_year.pdf")
## Saving 10 x 11 in image
ggsave("topics_by_year.png")
## Saving 10 x 11 in image
Licenses
# License of every repo, with the dataset it belongs to.
plt_data_licenses <- repo_data_all %>%
  select(license, is_high_profile)
# Map the logical flag to dataset labels with explicit levels, so the labels
# stay correct even if only one of FALSE/TRUE occurs in the data.
plt_data_licenses$is_high_profile <- factor(
  plt_data_licenses$is_high_profile,
  levels = c(FALSE, TRUE),
  labels = c("Main repos", "High profile repos")
)
# Number of repos per license, one facet per dataset. geom_bar() counts rows
# per discrete x value; geom_histogram(stat = "count") drew the same bars but
# warned about ignored binning parameters.
ggplot(plt_data_licenses, aes(x = license)) +
  geom_bar() +
  facet_wrap(~is_high_profile, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_text(size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.text.y = element_text(size = 14),
    strip.text = element_text(size = 14)
  ) +
  ylab("Number of repos") +
  xlab("License")

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("licenses.pdf")
## Saving 10 x 5 in image
ggsave("licenses.png")
## Saving 10 x 5 in image
Language features
# Read one table from the language dataset and lower-case its `language`
# column so it can be joined on.
read_lang_table <- function(table_name) {
  list_tabledata(project = proj_main,
                 dataset = ds_lang,
                 table = table_name) %>%
    mutate(language = tolower(language))
}
exec_method <- read_lang_table(table_exec_method)
type_system <- read_lang_table(table_type_system)
# One row per top language (lower-cased name plus its column-name form),
# with the execution-method and type-system columns joined on.
lang_features <- data.frame(language = tolower(top_langs),
                            lang_header = top_langs_as_header) %>%
  left_join(exec_method, by = "language") %>%
  left_join(type_system, by = "language")
## Warning: Column `language` joining factor and character vector, coercing
## into character vector
# For every entry of lang_features$lang_header, sum the repo_data_all column
# named "<prefix><header>" over the rows selected by `keep_rows`.
#   prefix     Column-name prefix, e.g. "bytes_".
#   keep_rows  Logical vector over the rows of repo_data_all; only TRUE rows
#              are summed (NA entries are dropped by which()).
sum_lang <- function(prefix, keep_rows) {
  row_idx <- which(keep_rows)
  sapply(lang_features$lang_header, function(header) {
    column_name <- paste0(prefix, header)
    sum(repo_data_all[row_idx, column_name])
  })
}
# Total bytes and file counts per language, separately for each dataset.
lang_features$bytes_high_profile <- sum_lang("bytes_", repo_data_all$is_high_profile)
lang_features$bytes_main <- sum_lang("bytes_", !repo_data_all$is_high_profile)
lang_features$files_high_profile <- sum_lang("num_files_", repo_data_all$is_high_profile)
lang_features$files_main <- sum_lang("num_files_", !repo_data_all$is_high_profile)
# Human-readable execution mode and type system, filled in row by row.
lang_features$exec <- ""
lang_features$type <- ""
# seq_len() makes the loop a no-op for a zero-row table (1:0 would run twice).
for (i in seq_len(nrow(lang_features))) {
  # Execution method
  interpreted <- isTRUE(lang_features[i, "interpreted"])
  compiled <- isTRUE(lang_features[i, "compiled"])
  lang_features[i, "exec"] <-
    if (interpreted && compiled) "Both"
    else if (interpreted) "Interpreted"
    else if (compiled) "Compiled"
    else NA
  # Type system
  strength <- lang_features[i, "strength"]
  system <- lang_features[i, "system"]
  safety <- lang_features[i, "safety"]
  # Join only the parts that are present. Pasting onto a missing first part
  # would otherwise put the literal text "NA" into the label.
  type_parts <- character(0)
  if (!is.na(strength)) type_parts <- c(type_parts, as.character(capitalize(strength)))
  if (!is.na(system)) type_parts <- c(type_parts, as.character(system))
  if (!is.na(safety)) type_parts <- c(type_parts, as.character(safety))
  type <- if (length(type_parts) > 0) paste(type_parts, collapse = " ") else NA
  lang_features[i, "type"] <- type
}
# Long format: one row per (language, measure, dataset), keeping only
# languages with a known execution mode and type system. The melted column
# name is split into the measure ("bytes" / "files") and a high-profile flag.
plt_data_lang_features <- lang_features %>%
  select(exec, type, bytes_high_profile, bytes_main,
         files_high_profile, files_main) %>%
  filter(!is.na(exec) & !is.na(type)) %>%
  melt(id.vars = c("exec", "type")) %>%
  mutate(is_high_profile = grepl("high_profile", variable)) %>%
  mutate(variable = gsub("_main", "", gsub("_high_profile", "", variable)))
# Facet label (`var`) and color key (`color`) for every row, computed for
# whole columns at once; this also works when the table has zero rows, where
# a 1:nrow() loop would fail.
plt_data_lang_features <- plt_data_lang_features %>%
  mutate(
    var = paste(capitalize(variable),
                ifelse(is_high_profile, "- high profile repos", "- main repos")),
    color = ifelse(is_high_profile, "h", "m")
  ) %>%
  select(exec, type, var, value, color) %>%
  # Normalize each value by the total of its facet.
  group_by(var) %>%
  mutate(sum_var = sum(value)) %>%
  ungroup() %>%
  mutate(val_normalized = value / sum_var)
# Bubble chart of execution mode against type system; point size is the
# share of the facet total, one facet per measure/dataset combination.
ggplot(plt_data_lang_features) +
  geom_point(aes(x = exec,
                 y = type,
                 size = val_normalized,
                 col = color)) +
  scale_color_manual(values = c(color_high_prof, color_main)) +
  theme_bw() +
  theme(
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 14),
    legend.position = "none"
  ) +
  facet_wrap(~var, scales = "free", ncol = 2) +
  xlab("Execution mode") +
  ylab("Type system")

# No plot is passed, so ggsave() writes the most recent plot.
ggsave("bytes_by_type_system.pdf")
## Saving 10 x 8 in image
ggsave("bytes_by_type_system.png")
## Saving 10 x 8 in image
Session info
# Print the R version, platform and attached/loaded package versions.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 jsonlite_1.5 scales_0.5.0
## [4] RColorBrewer_1.1-2 RISmed_2.1.7 purrr_0.2.4
## [7] Hmisc_4.1-1 Formula_1.2-2 survival_2.41-3
## [10] lattice_0.20-35 bigrquery_0.4.1 lubridate_1.7.1
## [13] ggplot2_2.2.1 dplyr_0.7.4 reshape2_1.4.3
##
## loaded via a namespace (and not attached):
## [1] splines_3.4.3 colorspace_1.3-2 htmltools_0.3.6
## [4] yaml_2.1.16 base64enc_0.1-3 rlang_0.1.6
## [7] pillar_1.0.1 foreign_0.8-69 glue_1.2.0
## [10] DBI_0.7 bindr_0.1 plyr_1.8.4
## [13] stringr_1.2.0 munsell_0.4.3 gtable_0.2.0
## [16] htmlwidgets_0.9 evaluate_0.10.1 labeling_0.3
## [19] latticeExtra_0.6-28 knitr_1.18 curl_3.1
## [22] htmlTable_1.11.2 Rcpp_0.12.14 acepack_1.4.1
## [25] openssl_0.9.9 backports_1.1.2 checkmate_1.8.5
## [28] gridExtra_2.3 digest_0.6.13 stringi_1.1.6
## [31] grid_3.4.3 rprojroot_1.3-2 tools_3.4.3
## [34] magrittr_1.5 lazyeval_0.2.1 tibble_1.4.1
## [37] cluster_2.0.6 pkgconfig_2.0.1 Matrix_1.2-12
## [40] data.table_1.10.4-3 assertthat_0.2.0 rmarkdown_1.8
## [43] httr_1.3.1 rstudioapi_0.7 R6_2.2.2
## [46] rpart_4.1-11 nnet_7.3-12 compiler_3.4.3